x = 0.1,
size = 10,
hjust = 0)
# ggdraw(fin)
ggsave("./Plots/Figure2_Color_BW.png", plot_row, width = 36, height = 24, units = "cm")
ggsave("./Plots/Figure2_Color_greyscale.png", plot_row, width = 36, height = 24, units = "cm")
########################################################
# Project:    Commission Communication
# Task:       Create balanced, topics-macthed sample of
#             executive PRs and compare understandability
# Author:     Christian Rauh (01.02.2021)
########################################################
# Intuition
# Worse clarity might hinge on the fact that Comm talks about inherently
# more 'complex' topics, i.e. topic distribution 'confounds' DV
# I thus engage in creating a balanced sample along the topics in COMM and UK/IRE PRS
# Following this work: https://onlinelibrary.wiley.com/doi/full/10.1111/ajps.12526?casa_token=QGbL_R2026UAAAAA%3AFGsvDIMFendg79CHd8o8_LRbLkxz8DoOp1z8tP3TwM7NV9ZBM3GcbewoSYo-zGiu9CsnZ34CWLCcYBg
# Packages ####
library(tidyverse) # 1.3.0
library(Hmisc) # 4.4-1
library(cowplot) # 1.1.0
library(textcat) #1.0-7
library(spacyr) # 1.2.1
library(quanteda) # 3.2.0
library(stm) # 1.3.6
# Textmatching package
# No longer on CRAN because of a dependency issue, dl and install archived version first
# url <- "https://cran.r-project.org/src/contrib/Archive/textmatching/textmatching_0.1.0.tar.gz"
# pkgFile <- "./Tools/textmatching_0.1.0.tar.gz"
# download.file(url = url, destfile = pkgFile)
# install.packages(c("stm", "cem")) # Dependencies
# install.packages(pkgs=pkgFile, type="source", repos=NULL)
library(textmatching)
# # Raw corpus ####
# # Select PRs from a similar time period (for domestic executives, shorter time frame available only)
#
# # Individual corpora
# # Varible construction follows exactly the approach for calculating the clarity indicators
# # See PRs-TextIndicators.R
#
# com <- read_rds("./Corpora/EC-PressReleases_1985-2020_clean.RDS") %>%
#   mutate(sender = "Commission") %>%
#   select(c(sender, date, month, text))
# com$cid <- 1:nrow(com) # Identical id as in extraction of text indicators
#
#
# uk <- read_rds("./Corpora/UK-GovPressReleases.Rds")%>%
#   filter(speech == F,
#          str_count(text, "\\. ") > 2, # faulty scrapes do not contain multiple sentences
#          str_detect(text, "(T|t)he "),
#          !str_detect(text, "PDF, [0-9]{1,4}KB"), # Mostly reference to files only
#          !str_detect(text, "HTML Details"), # Same
#          !str_detect(text, "Kick off time")) %>% # Lists of football travel advice
#   mutate(text = str_remove(paste0(headline, ". ", text), " Curriculum Vitae .*?$")) %>%
#   mutate(sender = "UK") %>%
#   mutate(month = str_remove(as.character(date), "-[0-9]{2}$")) %>%
#   select(c(sender, date, month, text))
# uk$cid <- 1:nrow(uk) # Identical id as in extraction of text indicators
#
#
# ire <- read_rds("./Corpora/IRE-GovPressReleases.Rds")%>%
#   mutate(text = paste0(headline, ". ", text) %>%
#            str_remove(fixed("Share . Email . Facebook . Twitter")) %>%
#            str_remove_all(fixed("•"))) %>%
#   mutate(sender = "IRE") %>%
#   mutate(month = str_remove(as.character(date), "-[0-9]{2}$")) %>%
#   select(c(sender, date, month, text))
# ire$cid <- 1:nrow(ire) # Identical id as in extraction of text indicators
#
# # Filter non-english sentences
# # Irish politicians often use Irish bits which would offset the grammar and lang indicators
#
# for (i in 1:nrow(ire)){
#
#   # Progress
#   print(round((i/nrow(ire))*100, 2))
#
#   # Sentence tokenizer
#   df <- spacy_tokenize(ire$text[i], what = "sentence") %>%
#     data.frame() %>%
#     rename(sentences = 1)
#
#   # Language detection
#   df$lang <- textcat(df$sentences)
#
#   # Drop non-english sentences
#   df <- df %>% filter(lang == "english")
#
#   # Rebuild and store text
#   ire$text[i] <- paste(df$sentences, collapse = " ")
# }
#
#
# # Combine corpora and inspect temporal overlap
# corp <- rbind(com, uk, ire)
#
# # PRs per sender over time
# ggplot(corp, aes(x = month)) +
#   geom_bar(aes(y = (..count..))) +
#   scale_x_discrete(breaks = paste0(seq(1985, 2021, 1), "-01"))+
#   facet_wrap(.~sender, ncol = 1, scales = "free_y") +
#   theme(axis.text.x = element_text(angle = 90, vjust = .5))
#
# ggplot(corp[corp$month > "2006-01", ], aes(x = month)) +
#   geom_bar(aes(y = (..count..))) +
#   scale_x_discrete(breaks = paste0(seq(2006, 2021, 1), "-01"))+
#   facet_wrap(.~sender, ncol = 1, scales = "free_y") +
#   theme(axis.text.x = element_text(angle = 90, vjust = .5))
#
# # So UK consisitently only avaialble from early 2010, IR from 2018 only
# # Both periods in which COMM PR output was declining
# # I choose to cut-off with the beginning of 2010, which suggest an overrepresentation
# # of the UK and an underreprensentation of IRE
# # that shouldn't matter as we match on Com vs both nat executives anyway ...
#
# # Combined 'raw' corpus
#
# raw <- corp %>%
#   filter(month >= "2010-01") %>%
#   mutate(commission = ifelse(sender == "Commission", T, F))
#
# ggplot(raw, aes(x = sender))+
#   geom_bar(aes(y = ..count..))
#
# # Clean up
# # rm(list = c("com", "ire", "uk"))
#
#
# # Merge understandability indicators
#
# # Commission
# com <- raw %>%  filter(sender == "Commission") %>% rename(cid = id)
# comdf <- read_rds("./data/PR-Comm_Language.Rds") %>%
#   select(c(meanSentenceLength, n_verb, n_noun, ntoken, flesch, familiarity)) %>%
#   mutate(cid = row_number())
# com <- merge(com, comdf, by = "cid", all.x = T)
#
# # UK
# uk <- raw %>%  filter(sender == "UK") %>% rename(cid = id)
# ukdf <- read_rds("./data/PR-UK_Language.Rds") %>%
#   select(c(meanSentenceLength, n_verb, n_noun, ntoken, flesch, familiarity)) %>%
#   mutate(cid = row_number())
# uk <- merge(uk, ukdf, by = "cid", all.x = T)
#
# # IRE
# ire <- raw %>%  filter(sender == "IRE") %>% rename(cid = id)
# iredf <- read_rds("./data/PR-IRE_Language.Rds") %>%
#   select(c(meanSentenceLength, n_verb, n_noun, ntoken, flesch, familiarity)) %>%
#   mutate(cid = row_number())
# ire <- merge(ire, iredf, by = "cid", all.x = T)
#
#
# # Re-combine corpus
# raw <- rbind(com, uk, ire)
#
# # Clean up
# # rm(list = c("com", "ire", "uk", "comdf", "ukdf", "iredf"))
#
# # Filter corpus
# # As in generating the main descriptive plot (PRs_DescriptivePlot.R)
# raw <- raw %>%
#   filter(meanSentenceLength > 3, # Average sentence length below three tokens is implausible
#          meanSentenceLength < 150) %>% # Average sentence above 150 tokens is implausible
#   filter(n_noun > 1 & n_verb > 1) %>% # At least one verb and one noun
#   filter(ntoken >= 5) %>%  # At least 5 words
#   filter(text != "")
#
#
# # Date filter (above string filter didn't work in dyplr context)
#
# raw$date2 <- as.Date(raw$date)
# min(raw$date2)
# max(raw$date2)
# raw <- raw %>%  filter(date2 >= as.Date("2010-01-01"))
#
#
# # Export
# write_rds(raw, "./Corpora/Overlap-EC-UK-IRE.rds")
# Reload save object
raw <- read_rds("./Corpora/Overlap-EC-UK-IRE.rds")
table(raw$sender)
min(raw$date)
max(raw$date)
# # Diverging word usage ####
#
# # Relative word frequencies
# corp <- corpus(raw$text, docvars = raw[,c("sender", "commission")])
# mat <- dfm(corp, tolower = T, stem = F, split_hyphens = T,
#            remove_punct = T, remove_numbers = T, remove_symbols = T, remove = stopwords("english"),
#            verbose = T)
# wmat <- dfm_weight(mat, scheme = "prop")
#
# # Get differences in word usage between Comm and nat executives
# # DF of differences in word usages
# typefreq <- dfm_group(mat, groups = docvars(wmat)$commission) %>%    # Group by Commission as sender (see above)
#   dfm_weight(scheme = "prop") %>%
#   t() %>%                                                             # Transpose matrix
#   convert(to = "data.frame") %>%
#   rename(word = doc_id,                                               # Transposition nuissance as first column was doc_id before
#          nat = `FALSE`,
#          com = `TRUE`) %>%
#   mutate(diff = com - nat,                                            # Difference in relative term freqs between Hitler and BT
#          uniqc = ifelse(nat == 0 & com != 0, T, F),                   # Word only used by Com, but not by nat executives
#          uniqn = ifelse(nat != 0 & com == 0, T, F)) %>%               # Word only used by nat executives, but not by Com
#   arrange(desc(diff))
#
# # Unique words in groups
# uniqcom <- typefreq %>% filter(uniqc) %>% arrange(desc(com)) %>% rename(relfreq = com) %>%
#   mutate(label = "Freq. used by the Commission\nbut never by nat. governments\nTop-30") %>% select(word, relfreq, label)
# uniqnat <- typefreq %>% filter(uniqn) %>% arrange(desc(nat)) %>% rename(relfreq = nat) %>%
#   mutate(label = "Freq. used by nat. govs.,\nbut never by the Commission\nTop-30") %>% select(word, relfreq, label)
#
# topuniqwords <- rbind(head(uniqcom, 30),
#                       head(uniqnat, 30))
#
# ggplot(topuniqwords, aes(x=relfreq, y = reorder(word, relfreq)))+
#   geom_col()+
#   facet_wrap(.~label, scales = "free") +
#   labs(title = "Unique Words in Commission and national press releases",
#        x= "Relative word frequencies in texts of respective groups",
#        y= "")+
#   theme_bw()+
#   theme(strip.text = element_text(face = "bold"),
#         axis.text.y = element_text(color = "black"))
#
# ggsave("./Plots/UniqueWords-COMvsNAT.png", width = 16, height = 12, units = "cm")
#
#
# # Rel freq diffs
# typefreq2 <- typefreq %>% filter(!uniqn & !uniqc) %>% # Exclude words that are unique for either group
#   arrange(desc(diff))
#
# strongdiff <- rbind(head(typefreq2, 30),      # Top 30 with highest diff (Com overweight)
#                     tail(typefreq2, 30)) %>%  # Top 30 with lowest diff (Nat overweight)
#   arrange(desc(diff)) %>%
#   mutate(run = rep(seq(1,30,1), 2))          # Sequence to place terms neatly onto the plot (no meaning)
#
# ggplot(strongdiff, aes(x=diff, y = run))+
#   geom_vline(xintercept = 0, linetype = "dashed")+
#   geom_text(aes(label = word))+
#   labs(title = "Which words do nat. governments and the Commission use in different frequencies?",
#        subtitle = "The Top-30 with the strongest differences in frequency, respectively. Only words used by both groups considered here.",
#        x = "Difference in relative word frequency in press releases\nby the Commission and UK/IRE governments",
#        y= "")+
#   theme_bw()+
#   theme(axis.text.y = element_blank(),
#         panel.grid.major.y = element_blank(),
#         panel.grid.minor.y = element_blank(),
#         axis.ticks.y = element_blank())
#
# ggsave("./Plots/DivisiveWords-COMvsNAT.png", width = 22, height = 12, units = "cm")
# Topic model ####
# See: https://raw.githubusercontent.com/bstewart/stm/master/vignettes/stmVignette.pdf
set.seed(20210201)
raw$commission <- ifelse(raw$sender == "Commission", 1, 0)
table(raw$commission)
# Pre-processing text data for stm
# for efficiency reasons, I do this with quanteda
qcorp <- corpus(raw$text, docvars = raw[ ,c("cid", "sender", "commission")])
# DFM: stemmed, lowered, no stopwords
# only words that appear in at least 5% and max 95% of docs
qdfm <- dfm(qcorp, tolower = T,
remove = stopwords("english"),
remove_punct = T,
remove_numbers = T,
remove_symbols = T,
verbose = T) %>%
dfm_trim(min_docfreq = 0.01, max_docfreq = 0.99, docfreq_type = "prop")
# Convert to stm typed object
sdfm <- convert(qdfm, to = "stm")
# Fit model
# # With content co-variate
# topicfit <- stm(documents = sdfm$documents, vocab = sdfm$vocab,
#                 K = 20, content = sdfm$meta$commission,
#                 max.em.its = 75, data = sdfm$meta,
#                 init.type = "Spectral", verbose = T)
# Export topicfit
# write_rds(topicfit, "./Data/STM_fit_OverlapCorpus.rds")
# Reload
topicfit <- read_rds("./Data/STM_fit_OverlapCorpus.rds")
# Assess and store topic content ####
# Overall topic distribution in corpus
# With six most probably words per topic
plot(topicfit, type = "summary", n = 6, xlim = c(0, .3)) #
# Topic words and labels
tlabels <- labelTopics(topicfit)
# Topic disribution across documents
topdstr <- topicfit$theta %>% as.data.frame() # Topic proportions by document
toptop <- colnames(topdstr)[apply(topdstr,1,which.max)] # Topic with highest proportion per doc
raw$toptopic <- toptop # Indicate most likely topic for each text in the overlap sample
# Understandability by topic ####
# Get data on understandability indicators
# by author and main topic of press releases
tu <- raw %>% mutate(verbal = n_verb/n_noun,
commission = as.logical(commission)) %>%
select(c(commission, toptopic, flesch, familiarity, verbal)) %>%
pivot_longer(cols = 3:5)
# Calculate topic means by actor type
nat.agg <- tu %>% filter(commission == 0) %>%
group_by(toptopic, name) %>%
summarise(natmean = mean(value))
com.agg <- tu %>% filter(commission == 1) %>%
group_by(toptopic, name) %>%
summarise(commean = mean(value))
# Get difference between actor types within topic
agg <- merge(nat.agg, com.agg, by = c("toptopic", "name")) %>%
mutate(diff = commean - natmean,
direc = ifelse(diff < 0, "EC PRs with lower average\nthan national PRs", "EC PRs with higher average\nthan national PRs"))
# Construct topic labels with 6 most probable words
twords <- tlabels$topics %>% as.data.frame() %>%
mutate(tnr = row_number())
twords$tlabel <- paste(twords$V1, twords$V2, twords$V3, sep = ", ")
twords$tlabel <- paste0(twords$tlabel, ",\n", twords$V4, ", ", twords$V5, ", ", twords$V6)
twords$tlabel <- paste0("T", twords$tnr, ": ", twords$tlabel)
# ... and merge to understandability data
agg$tnr <- str_remove(agg$toptopic, "V")
agg <- merge(agg, twords[, c("tnr", "tlabel")], by = "tnr", all.x = T)
# Determine labels and order of indicators
agg$name <- agg$name %>%
recode("flesch" = "Easy to read?\nFlesch/Kincaid score",
"familiarity" = "Familiar words?\nAverage Google Books frequency",
"verbal" = "Focus on action?\nRatio of verbs to nouns")
agg$name2 <- factor(agg$name,
levels = c("Easy to read?\nFlesch/Kincaid score",
"Familiar words?\nAverage Google Books frequency",
"Focus on action?\nRatio of verbs to nouns"))
# Determine topic order
toporder <- agg %>%  select(tnr, tlabel) %>% unique() %>% arrange(desc(as.numeric(tnr)))
agg$tlabel <- agg$tlabel %>% factor(levels = toporder$tlabel)
rm(toporder)
# Plot difference in understandability indicators by topic
ggplot(data = agg, aes(y = tlabel, x = diff, fill = direc))+
geom_rect(ymin =.5, ymax = 1.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =2.5, ymax = 3.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =4.5, ymax = 5.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =6.5, ymax = 7.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =8.5, ymax = 9.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =10.5, ymax = 11.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =12.5, ymax = 13.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =14.5, ymax = 15.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =16.5, ymax = 17.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =18.5, ymax = 19.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_col(width = .5)+
# geom_linerange(aes(xmin = Lower, xmax = Upper), position = position_dodge(width = .5))+
geom_vline(xintercept = 0, linetype = "dotted")+
labs(y = "Estimated main topic\nof press releases\n",
x = "\nAverage difference between press releases of the European Commission and nat. governments (UK/IRE)\non the respective indicator\n",
fill = "",
caption = "Based on full texts of 103,443 press releases (Commission: 11,837; UK: 84,940; IRE: 6,666) published between January 2010 and January 2021.\nTopic classification based on a Structural Topic Model with a content covariate accounting for wording differences between Commission and national PRs.\nLabels show the six most probable words within each topic.")+
scale_fill_manual(values = c("#003399", "#FFCC00"))+
facet_grid(.~name2, scales = "free_x")+
theme_bw()+
theme(legend.position = "bottom",
strip.text = element_text(face = "bold"),
axis.text.y = element_text(hjust = 0),
text = element_text(family = "serif"))
ggsave("./Plots/Fig_3_color.png", width = 24, height = 20, units = "cm")
ggsave("./Plots/Figure_3_color.png", width = 24, height = 20, units = "cm")
ggsave("./Plots/Figure3_color.png", width = 24, height = 20, units = "cm")
# Plot difference in understandability indicators by topic (Figure 3)
# Greyscale version
ggplot(data = agg, aes(y = tlabel, x = diff, fill = direc))+
geom_rect(ymin =.5, ymax = 1.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =2.5, ymax = 3.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =4.5, ymax = 5.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =6.5, ymax = 7.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =8.5, ymax = 9.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =10.5, ymax = 11.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =12.5, ymax = 13.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =14.5, ymax = 15.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =16.5, ymax = 17.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =18.5, ymax = 19.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_col(width = .5)+
# geom_linerange(aes(xmin = Lower, xmax = Upper), position = position_dodge(width = .5))+
geom_vline(xintercept = 0, linetype = "dotted")+
labs(y = "Estimated main topic\nof press releases\n",
x = "\nAverage difference between press releases of the European Commission and nat. governments (UK/IRE)\non the respective indicator\n",
fill = "",
caption = "Based on full texts of 103,443 press releases (Commission: 11,837; UK: 84,940; IRE: 6,666) published between January 2010 and January 2021.\nTopic classification based on a Structural Topic Model with a content covariate accounting for wording differences between Commission and national PRs.\nLabels show the six most probable words within each topic.")+
scale_fill_manual(values = c("black", "grey50"))+
facet_grid(.~name2, scales = "free_x")+
theme_bw()+
theme(legend.position = "bottom",
strip.text = element_text(face = "bold"),
axis.text.y = element_text(hjust = 0),
text = element_text(family = "serif"))
ggsave("./Plots/Figure3_greyscale.png", width = 24, height = 20, units = "cm")
# Plot difference in understandability indicators by topic (Figure 3)
# Color version
ggplot(data = agg, aes(y = tlabel, x = diff, fill = direc))+
geom_rect(ymin =.5, ymax = 1.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =2.5, ymax = 3.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =4.5, ymax = 5.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =6.5, ymax = 7.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =8.5, ymax = 9.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =10.5, ymax = 11.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =12.5, ymax = 13.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =14.5, ymax = 15.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =16.5, ymax = 17.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =18.5, ymax = 19.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_col(width = .5)+
# geom_linerange(aes(xmin = Lower, xmax = Upper), position = position_dodge(width = .5))+
geom_vline(xintercept = 0, linetype = "dotted")+
labs(y = "Estimated main topic\nof press releases\n ",
x = " \nAverage difference between press releases of the European Commission and nat. governments (UK/IRE)\non the respective indicator\n ",
fill = "",
caption = "Based on full texts of 103,443 press releases (Commission: 11,837; UK: 84,940; IRE: 6,666) published between January 2010 and January 2021.\nTopic classification based on a Structural Topic Model with a content covariate accounting for wording differences between Commission and national PRs.\nLabels show the six most probable words within each topic.")+
scale_fill_manual(values = c("#003399", "#FFCC00"))+
facet_grid(.~name2, scales = "free_x")+
theme_bw()+
theme(legend.position = "bottom",
strip.text = element_text(face = "bold"),
axis.text.y = element_text(hjust = 0),
text = element_text(family = "serif"))
ggsave("./Plots/Figure3_color.png", width = 24, height = 20, units = "cm")
# Plot difference in understandability indicators by topic (Figure 3)
# Greyscale version
ggplot(data = agg, aes(y = tlabel, x = diff, fill = direc))+
geom_rect(ymin =.5, ymax = 1.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =2.5, ymax = 3.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =4.5, ymax = 5.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =6.5, ymax = 7.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =8.5, ymax = 9.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =10.5, ymax = 11.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =12.5, ymax = 13.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =14.5, ymax = 15.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =16.5, ymax = 17.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_rect(ymin =18.5, ymax = 19.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
geom_col(width = .5)+
# geom_linerange(aes(xmin = Lower, xmax = Upper), position = position_dodge(width = .5))+
geom_vline(xintercept = 0, linetype = "dotted")+
labs(y = "Estimated main topic\nof press releases\n ",
x = " \nAverage difference between press releases of the European Commission and nat. governments (UK/IRE)\non the respective indicator\n ",
fill = "",
caption = "Based on full texts of 103,443 press releases (Commission: 11,837; UK: 84,940; IRE: 6,666) published between January 2010 and January 2021.\nTopic classification based on a Structural Topic Model with a content covariate accounting for wording differences between Commission and national PRs.\nLabels show the six most probable words within each topic.")+
scale_fill_manual(values = c("black", "grey50"))+
facet_grid(.~name2, scales = "free_x")+
theme_bw()+
theme(legend.position = "bottom",
strip.text = element_text(face = "bold"),
axis.text.y = element_text(hjust = 0),
text = element_text(family = "serif"))
ggsave("./Plots/Figure3_greyscale.png", width = 24, height = 20, units = "cm")
# CEM matching ####
refitted <- refit(topicfit, sdfm$documents, content_level = "1")
projection <- project(topicfit, sdfm$documents)
matched <- cem_match(refitted,projection=projection, sdfm$meta$commission,
projection_breaks=2)
matched
# Get indices of matched texts
sample.indices <- matched$matched %>%
as.data.frame() %>%
rename(matched = 1) %>%
mutate(index = row_number()) %>%
filter(matched == T)
# Get matched sample data
msample <- raw[c(sample.indices$index), ]
# Compare understandability indicators
# in matched sample
df <- msample %>%
mutate(verbal = n_verb/n_noun,
author = ifelse(commission ==1, "European Commission", "Nat. government (UK/IRE)")) %>%
select(author, flesch, familiarity, verbal) %>%
pivot_longer(cols = 2:4)
df$name <- df$name %>%
recode("flesch" = "Easy to read?\nFlesch/Kincaid score",
"familiarity" = "Familiar words?\nAverage Google Books frequency",
"verbal" = "Focus on action?\nRatio of verbs to nouns")
df$name2 <- factor(df$name,
levels = c("Easy to read?\nFlesch/Kincaid score",
"Familiar words?\nAverage Google Books frequency",
"Focus on action?\nRatio of verbs to nouns"))
ggplot(df, aes(x = value, y = author, colour = author))+
stat_summary(geom = "linerange", fun.data = "mean_cl_boot", size = 1.1) +
stat_summary(geom = "point", fun = "mean", size = 2.5) +
facet_wrap(.~name2, scales = "free_x", ncol = 1)+
scale_color_manual(values = c("#003399", "#FFCC00"))+
labs(title = "Comparing language of Commission and national press releases\nin a sample matched on topic distributions in full texts\n",
x = "Mean value of respective indicator and 95% c.i.s",
y = "",
color = "Press release author: ",
caption = "Coarsened exact match on topic distributions from a k=20 STM with a content covariate accounting for wording differences between Commission and national press releases.\n82,132 press releases (Commission: 11,321; Nat. gov.: 70,811) published between January 2010 and January 2021.")+
theme_bw()+
theme(legend.position = "bottom",
axis.text.y = element_blank(),
strip.text = element_text(face = "bold"),
text = element_text(family = "serif"),
plot.title = element_text(hjust = .5, face = "bold"))
ggsave("./Plots/FigureA4_TopicMatchedComparision.png", width = 24, height = 15, units = "cm")
ggplot(df, aes(x = value, y = author, colour = author))+
stat_summary(geom = "linerange", fun.data = "mean_cl_boot", size = 1.1) +
stat_summary(geom = "point", fun = "mean", size = 2.5) +
facet_wrap(.~name2, scales = "free_x", ncol = 1)+
scale_color_manual(values = c("#003399", "#FFCC00"))+
labs(title = "Comparing language of Commission and national press releases\nin a sample matched on topic distributions in full texts\n ",
x = "Mean value of respective indicator and 95% c.i.s",
y = "",
color = "Press release author: ",
caption = "Coarsened exact match on topic distributions from a k=20 STM with a content covariate accounting for wording differences between Commission and national press releases.\n82,132 press releases (Commission: 11,321; Nat. gov.: 70,811) published between January 2010 and January 2021.")+
theme_bw()+
theme(legend.position = "bottom",
axis.text.y = element_blank(),
strip.text = element_text(face = "bold"),
text = element_text(family = "serif"),
plot.title = element_text(hjust = .5, face = "bold"))
ggsave("./Plots/FigureA4_TopicMatchedComparision.png", width = 24, height = 15, units = "cm")
`EC-PressReleases_1985-2020_clean` <- readRDS("D:/WZB-Nextcloud/WZB_CR/Datensaetze/CommissionCommunication - JEI/Corpora/EC-PressReleases_1985-2020_clean.RDS")
`IRE-GovPressReleases` <- readRDS("D:/WZB-Nextcloud/WZB_CR/Datensaetze/CommissionCommunication - JEI/Corpora/IRE-GovPressReleases.Rds")
`UK-GovPressReleases` <- readRDS("D:/WZB-Nextcloud/WZB_CR/Datensaetze/CommissionCommunication - JEI/Corpora/UK-GovPressReleases.Rds")
PolSciAbstracts <- readRDS("D:/WZB-Nextcloud/WZB_CR/Datensaetze/CommissionCommunication - JEI/Corpora/PolSciAbstracts.rds")
BNC_RawTexts <- readRDS("D:/WZB-Nextcloud/WZB_CR/Datensaetze/CommissionCommunication - JEI/Corpora/BNC_RawTexts.rds")
R.Version()
